library(tidyverse)
library(fs)
ngsadmix_dir <- "data/ngsadmix_all/maf_0.05" # set NGSadmix outputs directory
N_K <- 10 # set number of K run
N_reps <- 4 # set number of reps run
# pull all log files
log_files <- list.files(ngsadmix_dir, pattern = ".log", full.names = T, recursive=T)
# read in all logs
all_logs <- lapply(1:length(log_files), FUN = function(i) readLines(log_files[i]))
# make list of the line that starts with "best like=" from all logs, just target 'b'
library(stringr)
bestlikes_str_list <- sapply(1:length(log_files), FUN= function(x) all_logs[[x]][which(str_sub(all_logs[[x]], 1, 1) == 'b')])
# make dataframe with 1:N_K and N_reps to add likelihood values
loglikes <- data.frame(K = rep(2:N_K, each=N_reps))
# add the log likelihood (first number in the string)
loglikes$loglike<-as.vector(as.numeric( sub("\\D*(\\d+).*", "\\1", bestlikes_str_list) ))
tapply(loglikes$loglike, loglikes$K, FUN= function(x) mean(abs(x))/sd(abs(x)))
2 3 4 5 6 7
1.314224e+03 3.724947e+08 Inf Inf 5.753390e+02 6.269528e+03
8 9 10
1.961828e+04 Inf Inf
alewife_palette <- c("#636c62", "#869ca8", "#a2878a", "#b6bc9f", "#ecc2a3", "#eceeed", "#242b35", "#a39faa", "#ac9b7c", "#3b3a35")
qcolors <- c(`Q1` = "#636c62",
`Q2` = "#869ca8",
`Q3` = "#a2878a",
`Q4` = "#b6bc9f",
`Q5` = "#ecc2a3",
`Q6` = "#eceeed",
`Q7` = "#242b35",
`Q8` = "#a39faa",
`Q9` = "#ac9b7c",
`Q10` = "#3b3a35")
gord <- c("BLUE",
"HYBR",
"GRTL",
"FINL",
"MIDL",
"CONL",
"NATLA",
"MIDA")
word <- c("Petitcodiac River",
"Hudson River",
"Lake Yonah",
"Lake Hartwell",
"Altamaha River",
"Roanoke River",
"Lake Superior",
"Lake Michigan",
"Lake Ontario",
"Canandaigua Lake",
"Cayuga Lake",
"Seneca Lake",
"Otisco Lake",
"East Grand Lake",
"Lake Champlain",
"Pattagansett Lake",
"Rogers Lake",
"Quonnipaug Lake",
"Miramichi River",
"Saco River",
"Black Creek",
"Choptank River")
metadata <- read_csv("data/herring_metadata.csv") %>%
select(NMFS_DNA_ID,
GENUS,
SPECIES,
STATE_F,
WATERSHED,
WATER_NAME,
grouping_v3) %>%
mutate(newname = paste0(WATER_NAME,
"-",
NMFS_DNA_ID)) %>%
mutate(newname = str_replace_all(newname,
" +",
"_")) %>%
mutate(gfact = factor(grouping_v3,
levels = gord),
wfact = factor(WATER_NAME,
levels = word)) %>%
arrange(gfact, wfact)
Rows: 94 Columns: 12── Column specification ──────────────────────────────────────────────────────────
Delimiter: ","
chr (11): sample, NMFS_DNA_ID, BOX_ID, BOX_POSITION, SAMPLE_ID, GENUS, SPECIES...
dbl (1): BATCH_ID
ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
nord <- unique(metadata$newname)
labels <- read_tsv("data/admix-labels.tsv")
Rows: 23 Columns: 4── Column specification ──────────────────────────────────────────────────────────
Delimiter: "\t"
chr (4): grouping_v3, group_label, WATER_NAME, pop_label
ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
tmp <- metadata %>%
mutate(xpos = 1:n())
group_pos <- tmp %>%
group_by(grouping_v3) %>%
summarise(midx = (min(xpos) - 0.5 + max(xpos) + 0.5)/ 2,
linex = max(xpos) + 0.5) %>%
mutate(midy = 1)
group_labels <- group_pos %>%
left_join(labels %>% group_by(grouping_v3) %>% slice(1)) %>%
mutate(gfact = factor(grouping_v3,
levels = gord)) %>%
arrange(gfact)
Joining with `by = join_by(grouping_v3)`
pop_pos <- tmp %>%
group_by(grouping_v3, WATER_NAME) %>%
summarise(midx = (min(xpos) - 0.5 + max(xpos) + 0.5) / 2,
linex = max(xpos) + 0.5) %>%
mutate(midy = 0)
`summarise()` has grouped output by 'grouping_v3'. You can override using the `.groups` argument.
pop_labels <- pop_pos %>%
left_join(labels)
Joining with `by = join_by(grouping_v3, WATER_NAME)`
ngsadmix_files <- dir_ls("data/ngsadmix_all",
recurse = TRUE,
glob = "*.qopt_with_sample_names")
ngsAdmix_tib <- lapply(ngsadmix_files, function(x) {
read.table(x,
header = TRUE) %>%
pivot_longer(cols = -sample,
names_to = "Qval",
values_to = "value") %>%
mutate(path = x,
.before = sample)
}) %>%
bind_rows() %>%
filter(!is.na(value)) %>%
mutate(Qval = str_replace(Qval,
"X",
"Q")) %>%
extract(path,
into = c("K",
"rep"),
regex = ".*K_([0-9]+)_rep_([0-9]+)/.*$",
convert = TRUE) %>%
inner_join(.,
metadata,
by = c("sample" = "NMFS_DNA_ID"),
relationship = "many-to-many") %>%
mutate(gfact = factor(grouping_v3,
levels = gord),
wfact = factor(WATER_NAME,
levels = word)) %>%
arrange(gfact, wfact)
ggplot(ngsAdmix_tib) +
geom_col(mapping = aes(x = factor(newname,
levels = nord),
y = value,
fill = Qval)) +
scale_fill_manual(values = qcolors) +
theme_bw() +
theme(axis.text.x = element_text(angle = 90,
size = 8,
vjust = 0.5)) +
facet_grid(rep ~ K) +
labs(x = "",
y = "Q value") +
geom_vline(xintercept = pop_labels$linex,
linetype = 2,
color = "gray25") +
geom_vline(xintercept = group_labels$linex)
K2 <- ngsAdmix_tib %>%
filter(K == 2)
K3 <- ngsAdmix_tib %>%
filter(K == 3)
K4 <- ngsAdmix_tib %>%
filter(K == 4)
K5 <- ngsAdmix_tib %>%
filter(K == 5)
K6 <- ngsAdmix_tib %>%
filter(K == 6)
K7 <- ngsAdmix_tib %>%
filter(K == 7)
K8 <- ngsAdmix_tib %>%
filter(K == 8)
K9 <- ngsAdmix_tib %>%
filter(K == 9)
K10 <- ngsAdmix_tib %>%
filter(K == 10)
ggplot(K2) +
geom_col(mapping = aes(x = factor(newname,
levels = nord),
y = value,
fill = Qval)) +
scale_fill_manual(values = qcolors) +
theme_bw() +
theme(axis.text.x = element_text(angle = 90,
size = 8,
vjust = 0.5)) +
facet_grid(rep ~ K) +
labs(x = "",
y = "Q value") +
geom_vline(xintercept = pop_labels$linex,
linetype = 2,
color = "gray25") +
geom_vline(xintercept = group_labels$linex)
In the admixture scenario of 2 source populations, the samples split into the two species of blueback (left) and alewife (right).
ggplot(K3) +
geom_col(mapping = aes(x = factor(newname,
levels = nord),
y = value,
fill = Qval)) +
scale_fill_manual(values = qcolors) +
theme_bw() +
theme(axis.text.x = element_text(angle = 90,
size = 8,
vjust = 0.5)) +
facet_grid(rep ~ K) +
labs(x = "",
y = "Q value") +
geom_vline(xintercept = pop_labels$linex,
linetype = 2,
color = "gray25") +
geom_vline(xintercept = group_labels$linex)
In the admixture scenario of 3 source populations, the Great and Finger Lakes samples pull out into their own group.
ggplot(K4) +
geom_col(mapping = aes(x = factor(newname,
levels = nord),
y = value,
fill = Qval)) +
scale_fill_manual(values = qcolors) +
theme_bw() +
theme(axis.text.x = element_text(angle = 90,
size = 8,
vjust = 0.5)) +
facet_grid(rep ~ K) +
labs(x = "",
y = "Q value") +
geom_vline(xintercept = pop_labels$linex,
linetype = 2,
color = "gray25") +
geom_vline(xintercept = group_labels$linex)
In the admixture scenario of four source populations, we see the the Roanoke River, East Grand Lake, and Lake Champlain samples pull out into their own group.
ggplot(K5) +
geom_col(mapping = aes(x = factor(newname,
levels = nord),
y = value,
fill = Qval)) +
scale_fill_manual(values = qcolors) +
theme_bw() +
theme(axis.text.x = element_text(angle = 90,
size = 8,
vjust = 0.5)) +
facet_grid(rep ~ K) +
labs(x = "",
y = "Q value") +
geom_vline(xintercept = pop_labels$linex,
linetype = 2,
color = "gray25") +
geom_vline(xintercept = group_labels$linex)
In the most admixture scenarios of five source populations, we begin to see Great and Finger Lakes form a nonspecific mosaic except for in the third repetition in which the Connecticut Lakes samples break out into their own group.
ggplot(K6) +
geom_col(mapping = aes(x = factor(newname,
levels = nord),
y = value,
fill = Qval)) +
scale_fill_manual(values = qcolors) +
theme_bw() +
theme(axis.text.x = element_text(angle = 90,
size = 8,
vjust = 0.5)) +
facet_grid(rep ~ K) +
labs(x = "",
y = "Q value") +
geom_vline(xintercept = pop_labels$linex,
linetype = 2,
color = "gray25") +
geom_vline(xintercept = group_labels$linex)
In the an admixture scenario of six source populations, we see the Connecticut Lakes samples break out and the same random mosaic in the Great and Finger Lakes samples, except for in the third repetition where only Pattagansett and Rogers Lakes separate out into their own source population.
ggplot(K7) +
geom_col(mapping = aes(x = factor(newname,
levels = nord),
y = value,
fill = Qval)) +
scale_fill_manual(values = qcolors) +
theme_bw() +
theme(axis.text.x = element_text(angle = 90,
size = 8,
vjust = 0.5)) +
facet_grid(rep ~ K) +
labs(x = "",
y = "Q value") +
geom_vline(xintercept = pop_labels$linex,
linetype = 2,
color = "gray25") +
geom_vline(xintercept = group_labels$linex)
In an admixture scenario of seven source populations, the anadromous alewife show their own source population.
ggplot(K8) +
geom_col(mapping = aes(x = factor(newname,
levels = nord),
y = value,
fill = Qval)) +
scale_fill_manual(values = qcolors) +
theme_bw() +
theme(axis.text.x = element_text(angle = 90,
size = 8,
vjust = 0.5)) +
facet_grid(rep ~ K) +
labs(x = "",
y = "Q value") +
geom_vline(xintercept = pop_labels$linex,
linetype = 2,
color = "gray25") +
geom_vline(xintercept = group_labels$linex)
In an admixture scenario of eight source populations, blueback samples begin to mosaic as well as the anadromous samples with the Quonnipaug Lake samples breaking out with the source population more present in the Mid-Atlantic managing unit.
ggplot(K9) +
geom_col(mapping = aes(x = factor(newname,
levels = nord),
y = value,
fill = Qval)) +
scale_fill_manual(values = qcolors) +
theme_bw() +
theme(axis.text.x = element_text(angle = 90,
size = 8,
vjust = 0.5)) +
facet_grid(rep ~ K) +
labs(x = "",
y = "Q value") +
geom_vline(xintercept = pop_labels$linex,
linetype = 2,
color = "gray25") +
geom_vline(xintercept = group_labels$linex)
In an admixture scenario of nine source populations, all blueback samples come from the same source population, three source populations mosaic through the Great and Finger Lakes, Rogers Lake breaks out into their own source population, and the anadromous samples mosaic into two nonspecific source populations.
ggplot(K10) +
geom_col(mapping = aes(x = factor(newname,
levels = nord),
y = value,
fill = Qval)) +
scale_fill_manual(values = qcolors) +
theme_bw() +
theme(axis.text.x = element_text(angle = 90,
size = 8,
vjust = 0.5)) +
facet_grid(rep ~ K) +
labs(x = "",
y = "Q value") +
geom_vline(xintercept = pop_labels$linex,
linetype = 2,
color = "gray25") +
geom_vline(xintercept = group_labels$linex)
In an admixture scenario of ten source populations, 1-2 Lake Champlain samples break out with Roanoke River samples, with all other Lake Champlain and East Grand Lake samples in their own source population. Pattagansett Lake breaks out from the other Connecticut Lakes samples.